Data Pull and Transformation

State Geographical Data

# State lookup table (abbreviation, name, division, region) from the base
# `datasets` vectors; it gets joined onto most datasets below.
# state.abb is already uppercase, so the previous toupper() was redundant.
state_abb <- state.abb
state_name <- state.name
state_region <- as.character(state.region)       # factor -> character
state_division <- as.character(state.division)   # factor -> character
state_prefix <- data.frame(state_abb, state_name, state_division, state_region)

str(state_prefix)
## 'data.frame':    50 obs. of  4 variables:
##  $ state_abb     : chr  "AL" "AK" "AZ" "AR" ...
##  $ state_name    : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ state_division: chr  "East South Central" "Pacific" "Mountain" "West South Central" ...
##  $ state_region  : chr  "South" "West" "West" "South" ...

Data Science Job

### Read every per-state salary CSV and stack them into one tibble.
state_csv <- list.files("Data Pull and Transformation/data/Population and Salary/states/")

# Read one state's file, coercing the columns whose parsed type varies
# across files so the frames bind cleanly, and converting the source's
# -1 missing-value marker to NA.
read_state_salaries <- function(file_name) {
  file_path <- paste0("Data Pull and Transformation/data/Population and Salary/states/", file_name)
  state_salaries <- read_csv(file_path) %>%
    mutate(
      `Salary Estimate` = as.character(`Salary Estimate`),
      Founded = as.character(Founded)
    )
  ### The data has -1 instead of NA's.  We are using NA's not -1.
  state_salaries[state_salaries == -1] <- NA
  state_salaries
}

# bind_rows over a list avoids growing the data frame inside a loop
# (the old for/union_all pattern re-copied the accumulator every pass).
datascientist_salaries <- bind_rows(lapply(state_csv, read_state_salaries))



### Normalize column names: lower-case everything and replace spaces with
### underscores (e.g. "Salary Estimate" -> "salary_estimate").
### (The old comment mentioned periods, but the pattern replaced spaces.)
colnames(datascientist_salaries) <- gsub(" ", "_", tolower(colnames(datascientist_salaries)), fixed = TRUE)



### We need the city and state for where the job is. `location` looks like
### "City, ST" with an occasional third comma-separated piece.
### str_split_fixed(..., 3) guarantees exactly three columns (padding with
### "") even if no row has a second comma, so the three names always fit;
### plain str_split(simplify = TRUE) only returns as many columns as the
### widest row happens to have.
df_location <- as_tibble(str_split_fixed(datascientist_salaries$location, ", ", 3))
colnames(df_location) <- c("city", "state", "other")

### Adding the columns back to the original dataframe
datascientist_salaries <- cbind(datascientist_salaries, df_location)


# Cleaning Section
### Due to the volume of data we have we are going to remove the rows that have
### data in the column other and any NA in the salaries.
### When removing the other data set and the salaries with NA's we remove almost
### 100 records out of 12,537.  That feels right.

### We need the min and max salary estimate.  We will take the mid point in our 
### Analysis 



### Split the company-size text ("X to Y Employees") into numeric
### min_size / max_size columns and attach them to the jobs data.
df_size <- as.data.frame(str_split_fixed(datascientist_salaries$size, " to ", 2))
colnames(df_size) <- c("min_size", "max_size")

df_size <- df_size %>%
  mutate(
    # "10000+ Employees" has no upper half after the split; Unknown and
    # blank sizes become 0.
    min_size = case_when(
      min_size == "10000+ Employees" ~ "10000",
      min_size == "Unknown" ~ "0",
      min_size == "" ~ "0",
      TRUE ~ min_size
    ),
    max_size = str_remove(max_size, " Employees"),
    # Single-valued sizes leave max blank; mirror min into max.
    max_size = if_else(max_size == "", min_size, max_size),
    min_size = as.numeric(min_size),
    max_size = as.numeric(max_size)
  )

datascientist_salaries <- cbind(datascientist_salaries, df_size)

### Replace any character outside [[:graph:]] (i.e. anything that is not a
### visible printable character) with a space in the two free-text fields.
datascientist_salaries$job_title <- str_replace_all(datascientist_salaries$job_title,"[^[:graph:]]", " ") 
datascientist_salaries$job_description <- str_replace_all(datascientist_salaries$job_description,"[^[:graph:]]", " ") 

### Inverse of the keep-side filter used for df_data_science_jobs: rows
### that match NONE of the data-science keywords in either job_function
### or job_title, kept so the dropped records can be inspected.
### The long chains of `str_detect(...) == FALSE` are replaced with
### single negated alternation patterns -- !(A|B|...) is equivalent to
### !A & !B & ... and NA propagates identically.
drop_function_pattern <- "data|engineer|analyst|business|model|consult|analytic|finance|stat|chain"
drop_title_pattern <- "data|engineer|analyst|business|model|consult|analytic|finance|stat"

df_data_science_drop <- datascientist_salaries %>%
  filter(state != "DC" & !is.na(salary_estimate)) %>%
  # Rows where `city` holds a full state name pick up its abbreviation.
  left_join(state_prefix, by = c("city" = "state_name")) %>%
  mutate(
    state_abb = case_when(
      state == "" ~ state_abb,
      state == "Los Angeles" ~ "CA",
      TRUE ~ state
    )
  ) %>%
  filter(
    !is.na(state_abb) &
      # "AI" is matched case-sensitively to avoid hitting ordinary words.
      !str_detect(tolower(job_function), drop_function_pattern) &
      !str_detect(job_function, "AI") &
      !str_detect(tolower(job_title), drop_title_pattern) &
      !str_detect(job_title, "AI")
  )




### Build the analysis table of data-science jobs:
###  * recover state abbreviations (the split `city`/`state`/`other` fields
###    are messy),
###  * keep rows matching any data-science keyword in function or title,
###  * parse the salary-range text into numeric min/max/avg,
###  * bucket titles into Entry/Senior/Leader,
###  * re-attach state geography and order the columns.
df_data_science_jobs <-  datascientist_salaries %>%
            ### `city` sometimes holds a full state name; joining it against
            ### state_prefix$state_name recovers the abbreviation for those rows
            ### (real city names simply get NA lookup columns).
            left_join(state_prefix, by = c("city"="state_name")) %>% 
            mutate(
              ### When the split `state` token is not exactly 2 characters, the
              ### real code spilled into `other` (e.g. "City, Suffix, ST").
              state_abb = case_when(state =="" ~ state_abb,
                                    #state == "Los Angeles" ~ "CA",
                                    str_count(state) != 2 ~ other,
                                    TRUE ~ state)
            ) %>% 
            filter(
                  ### Drop DC, rows without a salary, and rows with no state;
                  ### then keep anything matching a data-science keyword.
                  state != "DC" & !is.na(salary_estimate) &  !is.na(state_abb) &
                  (
                    # tolower(job_function) %notin% c("data analyst","data scientist","data engineer","business analyst") |
                    str_detect(tolower(job_function), "data") |
                    str_detect(tolower(job_function), "engineer") |
                    str_detect(tolower(job_function), "analyst") |
                    str_detect(tolower(job_function), "business") |
                    str_detect(tolower(job_function), "model") |
                    str_detect(tolower(job_function), "consult") |
                    str_detect(tolower(job_function), "analytic") |
                    str_detect(tolower(job_function), "finance") |
                    str_detect(tolower(job_function), "stat") |
                    str_detect(tolower(job_function), "chain") |
                    ### "AI" is matched case-sensitively to avoid ordinary words.
                    str_detect(job_function, "AI") |
                    ### Bringing in by title
                    str_detect(tolower(job_title), "data") |
                    str_detect(tolower(job_title), "engineer") |
                    str_detect(tolower(job_title), "analyst") |
                    str_detect(tolower(job_title), "business") |
                    str_detect(tolower(job_title), "model") |
                    str_detect(tolower(job_title), "consult") |
                    str_detect(tolower(job_title), "analytic") |
                    str_detect(tolower(job_title), "finance") |
                    str_detect(tolower(job_title), "stat") |
                    str_detect(job_title, "AI") 
                   )
                  ) %>% 
            mutate(
                    ### Strip the parentheses around "(Glassdoor est.)" so the
                    ### salary figures can be extracted from the remaining text.
                    salary_estimate = gsub("\\(","",salary_estimate),
                    salary_estimate = gsub("\\)","",salary_estimate),
                    ### First number in the string is the range minimum (in $K).
                    min_salary = as.numeric(str_extract(salary_estimate, "\\d+"))*1000,
                    ### NOTE(review): \K inside a lookahead is non-standard
                    ### regex; this appears intended to grab the number just
                    ### before " Glassdoor est." (the range maximum) -- confirm
                    ### it really extracts the second figure and not NA.
                    max_salary = as.numeric(str_extract(salary_estimate, "\\d+(?=\\K Glassdoor est.)"))*1000,
                    ### Single-figure estimates: fall back to the minimum.
                    max_salary = if_else(is.na(max_salary),min_salary,max_salary),
                    avg_salary = (max_salary + min_salary) / 2,
                    avg_size = (max_size + min_size) / 2,
                    ### Bucket titles into Entry/Senior/Leader; case_when takes
                    ### the FIRST matching rule. NOTE(review): "vp" maps to
                    ### Senior while "vice president" maps to Leader, and the
                    ### "director" lines are duplicated -- confirm intended.
                    mega_job_title = case_when(
                                      str_detect(tolower(job_title), "entry") ~ "Entry",
                                      str_detect(tolower(job_title), "vp") ~ "Senior",
                                      str_detect(tolower(job_title), "vice president") ~ "Leader",
                                      str_detect(tolower(job_title), "manager") ~ "Leader",
                                      str_detect(tolower(job_title), "senior") ~ "Senior",
                                      str_detect(tolower(job_title), "sr") ~ "Senior",
                                      str_detect(tolower(job_title), "principal") ~ "Senior",
                                      str_detect(tolower(job_title), "lead") ~ "Senior",
                                      str_detect(tolower(job_title), "ii") ~ "Senior",
                                      str_detect(tolower(job_title), "2") ~ "Senior",
                                      str_detect(tolower(job_title),"associate") ~ "Senior",
                                      str_detect(tolower(job_title), "director") ~ "Leader",
                                      str_detect(tolower(job_title), "director") ~ "Leader",
                                      
                                      ### Fall back to job_function when the
                                      ### title matched nothing above.
                                      str_detect(tolower(job_function), "entry") ~ "Entry",
                                      str_detect(tolower(job_function), "vp") ~ "Senior",
                                      str_detect(tolower(job_function), "vice president") ~ "Leader",
                                      str_detect(tolower(job_function), "manager") ~ "Leader",
                                      str_detect(tolower(job_function), "senior") ~ "Senior",
                                      str_detect(tolower(job_function), "sr") ~ "Senior",
                                      str_detect(tolower(job_function), "principal") ~ "Senior",
                                      str_detect(tolower(job_function), "lead") ~ "Senior",
                                      str_detect(tolower(job_function), "ii") ~ "Senior",
                                      str_detect(tolower(job_function), "2") ~ "Senior",
                                      str_detect(tolower(job_function),"associate") ~ "Senior",
                                      str_detect(tolower(job_function), "director") ~ "Leader",
                                      str_detect(tolower(job_function), "director") ~ "Leader",
                                      TRUE ~ "Entry"
                            )
                   ) %>% 
            ### Drop the raw/duplicated columns, then re-attach the clean
            ### geography via the repaired state_abb and order the columns.
            select(-other,-founded,-industry, -revenue,  
                   -type_of_ownership, -size, -salary_estimate, -location, -state, 
                   -state_division,-state_region) %>%
            left_join(state_prefix, by = c("state_abb"="state_abb")) %>%             
            relocate(job_description, .after = avg_salary) %>% 
            relocate(mega_job_title, .after = job_title) %>% 
            relocate(c(state_abb,state_name,state_division,state_region), .after = city)

str(df_data_science_jobs)
## 'data.frame':    8269 obs. of  17 variables:
##  $ job_title      : chr  "Data Analyst" "DATA SCIENCE CONSULTANT BIRMINGHAM" "Data Analyst" "Data Scientist" ...
##  $ mega_job_title : chr  "Entry" "Entry" "Entry" "Entry" ...
##  $ job_function   : chr  "Data Analyst" "Data Consultant" "Data Analyst" "Data Scientist" ...
##  $ company_name   : chr  "DirectPath\n4.1" "managementsolutions\n3.7" "Ascent Health" "Southern Company\n4.3" ...
##  $ sector         : chr  "Insurance" "Business Services" NA "Oil, Gas, Energy & Utilities" ...
##  $ city           : chr  "Birmingham" "Birmingham" "Birmingham" "Birmingham" ...
##  $ state_abb      : chr  "AL" "AL" "AL" "AL" ...
##  $ state_name     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ state_division : chr  "East South Central" "East South Central" "East South Central" "East South Central" ...
##  $ state_region   : chr  "South" "South" "South" "South" ...
##  $ min_size       : num  201 1001 0 10000 501 ...
##  $ max_size       : num  500 5000 0 10000 1000 10000 10000 200 200 200 ...
##  $ min_salary     : num  38000 52000 43000 63000 37000 78000 51000 37000 59000 44000 ...
##  $ max_salary     : num  77000 122000 87000 118000 85000 147000 104000 75000 122000 95000 ...
##  $ avg_salary     : num  57500 87000 65000 90500 61000 ...
##  $ job_description: chr  "About DirectPath  DirectPath guides employees to make better health care decisions with individualized educatio"| __truncated__ "United States  DATA SCIENCE CONSULTANT BIRMINGHAM  Birmingham / Internship / Number of vacancies: 2     You wil"| __truncated__ "Ascent Health manages multiple healthcare entities and we are currently hiring for a Laboratory Data Analyst po"| __truncated__ "Data Scientist Description This position is responsible for creating, enhancing, and deploying predictive model"| __truncated__ ...
##  $ avg_size       : num  350 3000 0 10000 750 ...

Census Data

### Household Median Income Pull
### Nine years (2011-2019) of ACS 5-year median household income (mhhi,
### census variable B19013_001), one row per state per year.
all_year <- 2011:2019

# bind_rows over a list replaces the old for/union_all loop (which grew
# the accumulator every iteration and left a stray test `y <- 2019`).
hhincome_all <- bind_rows(lapply(all_year, function(y) {
  get_acs(geography = "state", year = y, survey = "acs5",
          variables = "B19013_001") %>%
    transmute(
      # Tag each pull with its survey year and calendar bounds so the
      # stacked years stay distinct.
      year_census = y,
      date_census_start = as_date(paste0(y, "-01-01")),
      date_census_end = as_date(paste0(y, "-12-31")),
      state_name = NAME,
      mhhi = estimate
    )
}))


### Population Pull
### Same nine ACS years as the income pull, census variable B01003_001.
population_all <- bind_rows(lapply(all_year, function(y) {
  get_acs(geography = "state", year = y, survey = "acs5",
          variables = "B01003_001") %>%
    transmute(
      # Tag each pull with its survey year and calendar bounds so the
      # stacked years stay distinct.
      year_census = y,
      date_census_start = as_date(paste0(y, "-01-01")),
      date_census_end = as_date(paste0(y, "-12-31")),
      state_name = NAME,
      population = estimate
    )
}))



### Combine household median income and population into one census table.
df_census_data <- population_all %>%
  left_join(hhincome_all,
            by = c("year_census", "date_census_start",
                   "date_census_end", "state_name")) %>%
  mutate(
    # Flag the most recent survey year (no if_else(cond, TRUE, FALSE)
    # needed -- the comparison already yields a logical).
    census_current_flag = year_census == max(year_census),
    median_household_income = if_else(is.na(mhhi), 0, mhhi)
  ) %>%
  left_join(state_prefix, by = "state_name") %>%
  mutate(
    # DC is not in the 50-state lookup; fill its geography by hand.
    # NOTE(review): region "Northeast" paired with division
    # "South Atlantic" looks inconsistent for DC -- confirm intended.
    state_abb = if_else(is.na(state_abb), "DC", state_abb),
    state_region = if_else(is.na(state_region), "Northeast", state_region),
    state_division = if_else(is.na(state_division), "South Atlantic", state_division)
  ) %>%
  # The final select both orders the columns and drops the raw mhhi,
  # making the old separate select(-mhhi) redundant.
  select(year_census, date_census_start, date_census_end, state_abb, state_name,
         state_region, state_division, population, median_household_income,
         census_current_flag)

str(df_census_data)
## tibble [468 × 10] (S3: tbl_df/tbl/data.frame)
##  $ year_census            : int [1:468] 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ date_census_start      : Date[1:468], format: "2011-01-01" "2011-01-01" ...
##  $ date_census_end        : Date[1:468], format: "2011-12-31" "2011-12-31" ...
##  $ state_abb              : chr [1:468] "AL" "AK" "AZ" "AR" ...
##  $ state_name             : chr [1:468] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ state_region           : chr [1:468] "South" "West" "West" "South" ...
##  $ state_division         : chr [1:468] "East South Central" "Pacific" "Mountain" "West South Central" ...
##  $ population             : num [1:468] 4747424 700703 6337373 2895928 36969200 ...
##  $ median_household_income: num [1:468] 42934 69014 50752 40149 61632 ...
##  $ census_current_flag    : logi [1:468] FALSE FALSE FALSE FALSE FALSE FALSE ...

Tax Data

# https://taxfoundation.org/publications/state-individual-income-tax-rates-and-brackets/
# https://taxfoundation.org/publications/federal-tax-rates-and-tax-brackets/

### State income-tax brackets (long format); the bracket columns arrive as
### text, so coerce them to numeric.
df_state_tax_long <- read.csv("Data Pull and Transformation/data/Tax/state_income_tax.csv")
df_state_tax_long$single_brackets <- as.numeric(df_state_tax_long$single_brackets)
df_state_tax_long$married_brackets <- as.numeric(df_state_tax_long$married_brackets)

### Federal brackets load clean as-is.
df_federal_income_tax <- read.csv("Data Pull and Transformation/data/Tax/federal_income_tax.csv")


### Widen the bracket table: within each state, a row's bracket ceiling is
### the next row's floor; the top bracket has no successor, so cap it at
### 1,000,000. AK and HI are excluded (no jobs data for them downstream).
df_state_tax_wide <- df_state_tax_long %>%
  filter(state_abb %notin% c('AK','HI')) %>%
  group_by(state_abb) %>%
  mutate(
    # coalesce(lead(x), cap) replaces if_else(!is.na(lead(x)), lead(x), cap).
    max_single_bracket = coalesce(lead(single_brackets), 1000000),
    max_married_bracket = coalesce(lead(married_brackets), 1000000),
    min_single_bracket = single_brackets,
    min_married_bracket = married_brackets
  ) %>%
  ungroup() %>%
  # Explicit key (state_abb is the only shared column) instead of relying
  # on the natural-join message.
  left_join(state_prefix, by = "state_abb") %>%
  select(state_abb, state_name, single_rates, min_single_bracket,
         max_single_bracket, min_married_bracket, max_married_bracket)


str(df_state_tax_wide)
## tibble [190 × 7] (S3: tbl_df/tbl/data.frame)
##  $ state_abb          : chr [1:190] "AL" "AL" "AL" "AZ" ...
##  $ state_name         : chr [1:190] "Alabama" "Alabama" "Alabama" "Arizona" ...
##  $ single_rates       : num [1:190] 0.02 0.04 0.05 0.03 0.03 0.04 0.05 0.08 0.02 0.04 ...
##  $ min_single_bracket : num [1:190] 0 500 3000 0 27272 ...
##  $ max_single_bracket : num [1:190] 500 3000 1000000 27272 54544 ...
##  $ min_married_bracket: num [1:190] 0 1000 6000 0 54544 ...
##  $ max_married_bracket: num [1:190] 1000 6000 1000000 54544 109088 ...
str(df_federal_income_tax)
## 'data.frame':    7 obs. of  5 variables:
##  $ rates               : num  0.1 0.12 0.22 0.24 0.32 0.35 0.37
##  $ min_single_bracket  : int  0 9951 40526 86376 164926 209426 523601
##  $ max_single_brackets : int  9950 40525 86375 164925 209425 523600 100000
##  $ min_married_brackets: int  0 19902 81052 172752 329852 418852 628301
##  $ max_married_brackets: int  19900 81050 172750 329850 418850 628300 100000

Zillow Data

### Zillow 1-bedroom home values by city: drop the older month columns,
### reshape to one row per city-month, and derive a monthly carrying cost.
df_zillow_city <- read_csv("Data Pull and Transformation/data/City/City_zhvi_bdrmcnt_1_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv")[,-9:-248] %>%
  pivot_longer(
    cols = contains("-"),
    names_to = "date",
    values_to = "house_price"
  ) %>%
  mutate(
    # Amortized payment over 360 periods at a 0.025 per-period rate.
    # NOTE(review): 2.5% per period over 360 periods is an unusually high
    # mortgage rate assumption -- confirm the intended rate.
    rent_cost = house_price*(.025*(1+.025)^360)/((1+.025)^360-1),
    city = RegionName
  ) %>%
  select(city, State, StateName, date, Metro, CountyName, house_price, rent_cost)



### Zillow 1-bedroom home values by state: same reshape and carrying-cost
### derivation as the city pull, then attach region/division geography.
df_zillow_state <- read_csv("Data Pull and Transformation/data/State/State_zhvi_bdrmcnt_1_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv")[,-6:-245] %>%
  pivot_longer(
    cols = contains("-"),
    names_to = "date",
    values_to = "house_price"
  ) %>%
  mutate(
    # Same amortization formula as the city pull.
    # NOTE(review): 0.025 per period over 360 periods -- confirm rate.
    rent_cost = house_price*(.025*(1+.025)^360)/((1+.025)^360-1),
    state_name = RegionName,
    state_abb = StateName
  ) %>%
  # Natural join on the shared state_name / state_abb columns.
  left_join(state_prefix) %>%
  select(state_abb, state_name, state_division, state_region, date, house_price, rent_cost)


### Import Zillow ZHVI zip-level data for 1-5 bedroom homes.
### Every file gets the identical treatment, so the 5x copy-paste is
### collapsed into one helper: keep the id columns plus the most recent
### 51 month columns (262:312), tag the bedroom count, rename RegionName
### to Zip, reorder so bedrooms sits with the id columns, and standardize
### the zip codes.
read_bedroom_file <- function(n_bedrooms) {
  path <- paste0(
    "Data Pull and Transformation/data/Zip/Zip_zhvi_bdrmcnt_",
    n_bedrooms, "_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv"
  )
  homes <- read_csv(path)[, c(1:3, 6:9, 262:312)] %>%
    mutate(bedrooms = n_bedrooms) %>%
    rename(Zip = RegionName) %>%
    select(c(1:7, bedrooms, 8:58))
  homes$Zip <- clean.zipcodes(homes$Zip)
  homes
}

oneBedroom   <- read_bedroom_file(1)
twoBedroom   <- read_bedroom_file(2)
threeBedroom <- read_bedroom_file(3)
fourBedroom  <- read_bedroom_file(4)
fiveBedroom  <- read_bedroom_file(5)

### Bind to one data frame, melt dates

allhomes <- rbind(oneBedroom, twoBedroom, threeBedroom, fourBedroom, fiveBedroom)
allhomes <- allhomes %>% arrange(State, Zip, bedrooms)

# Keep the contiguous US only; %in% replaces the chained ==/| comparisons.
allhomes <- allhomes[!allhomes$State %in% c('HI', 'AK', 'DC'), ]
length(unique(allhomes$State))
## [1] 48
# reshape2::melt leaves `variable` (renamed to dates) as a factor, which
# the date-range filters further down rely on -- don't switch to
# pivot_longer without revisiting those.
allhomesClean <- reshape2::melt(allhomes, id = c(1:8)) %>%
  rename(house_expense = value, dates = variable)


### add zip code locations: lat/long from the `zipcode` package dataset,
### then state name/region/division from state_prefix.
data("zipcode")

# Inner merge: rows whose Zip is absent from the zipcode table are dropped.
allhomesLocation = merge(allhomesClean, zipcode, by.x = 'Zip', by.y = 'zip') # Then merge with the zipcode data

# Left merge: keep every home row even when the state lookup misses.
allhomesLocation <- merge(x = allhomesLocation, y = state_prefix, by.x = "state", by.y = "state_abb" , all.x = TRUE)

str(allhomesLocation)
## 'data.frame':    6219399 obs. of  17 variables:
##  $ state         : chr  "AL" "AL" "AL" "AL" ...
##  $ Zip           : chr  "35091" "35091" "35091" "35091" ...
##  $ RegionID      : num  73316 73316 73316 73316 73316 ...
##  $ SizeRank      : num  16685 16685 16685 16685 16685 ...
##  $ State         : chr  "AL" "AL" "AL" "AL" ...
##  $ City          : chr  "Kimberly" "Kimberly" "Kimberly" "Kimberly" ...
##  $ Metro         : chr  "Birmingham-Hoover" "Birmingham-Hoover" "Birmingham-Hoover" "Birmingham-Hoover" ...
##  $ CountyName    : chr  "Jefferson County" "Jefferson County" "Jefferson County" "Jefferson County" ...
##  $ bedrooms      : num  2 2 4 4 5 3 5 3 2 4 ...
##  $ dates         : Factor w/ 51 levels "2017-01-31","2017-02-28",..: 17 9 1 38 46 38 20 39 13 5 ...
##  $ house_expense : num  91319 86524 235258 269600 313362 ...
##  $ city          : chr  "Kimberly" "Kimberly" "Kimberly" "Kimberly" ...
##  $ latitude      : num  33.8 33.8 33.8 33.8 33.8 ...
##  $ longitude     : num  -86.8 -86.8 -86.8 -86.8 -86.8 ...
##  $ state_name    : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ state_division: chr  "East South Central" "East South Central" "East South Central" "East South Central" ...
##  $ state_region  : chr  "South" "South" "South" "South" ...

About the Data: Data Science Jobs

Box Plot

### Salary spread by state, West and Northeast regions only.
df_data_science_jobs %>%
  filter(state_region %in% c("West", "Northeast")) %>%
  ggplot(aes(state_abb, avg_salary)) +
  geom_boxplot(color = "red", fill = "white") +
  stat_smooth(method = "lm") +
  labs(title = "Hot Spot States within US", x = "States", y = "Avg Salary") +
  scale_y_continuous(label = comma) +
  theme_minimal() +
  theme(text = element_text(size = 20))

### Salary spread compared across all four census regions.
df_data_science_jobs %>%
  filter(state_region %in% c("Northeast", "West", "South", "North Central")) %>%
  ggplot(aes(state_region, avg_salary)) +
  geom_boxplot(color = "red", fill = "white") +
  stat_smooth(method = "lm") +
  labs(title = "Salary Comparison by Regions", x = "US Regions", y = "Avg Salary") +
  scale_y_continuous(label = comma) +
  theme_minimal() +
  theme(text = element_text(size = 20))

Histogram

# Now let's compare specific job titles and average salaries.
# Which levels are better paid, and which are more frequent?

### Salary distribution across all seniority buckets, under $200k.
### (Title typo fixed: "Scientest" -> "Scientist".)
df_data_science_jobs %>%
  filter(mega_job_title %in% c("Entry", "Senior", "Leader") & avg_salary < 200000) %>%
  ggplot() +
  aes(x = avg_salary) +
  geom_histogram(bins = 15, color = "red", fill = "white") +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Data Scientist",
       x = "Avg Salary",
       y = "Salary Count") +
  theme_minimal() +
  theme(text = element_text(size = 20))

### Entry-level salary distribution, under $200k.
### (Title typo fixed: "Scientest" -> "Scientist".)
df_data_science_jobs %>%
  filter(mega_job_title %in% c("Entry") & avg_salary < 200000) %>%
  ggplot() +
  aes(x = avg_salary) +
  geom_histogram(bins = 20, color = "red", fill = "white") +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Entry Level Data Scientist",
       x = "Avg Salary",
       y = "Salary Count") +
  theme_minimal() +
  theme(text = element_text(size = 20))

### Senior-level salary distribution, under $200k.
### (Title typo fixed: "Scientest" -> "Scientist".)
df_data_science_jobs %>%
  filter(mega_job_title %in% c("Senior") & avg_salary < 200000) %>%
  ggplot() +
  aes(x = avg_salary) +
  geom_histogram(bins = 20, color = "red", fill = "white") +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Senior Level Data Scientist",
       x = "Avg Salary",
       y = "Salary Count") +
  theme_minimal() +
  theme(text = element_text(size = 20))

### Leader-level salary distribution, under $200k.
### (Title typo fixed: "Scientest" -> "Scientist".)
df_data_science_jobs %>%
  filter(mega_job_title %in% c("Leader") & avg_salary < 200000) %>%
  ggplot() +
  aes(x = avg_salary) +
  geom_histogram(bins = 20, color = "red", fill = "white") +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Leader Level Data Scientist",
       x = "Avg Salary",
       y = "Salary Count") +
  theme_minimal() +
  theme(text = element_text(size = 20))

About the Census Data

Bar Chart

### Most recent census year, contiguous US only; state names are
### lower-cased to line up with map_data("state") regions.
census_data <- df_census_data %>%
  filter(
    census_current_flag == TRUE,
    !state_abb %in% c("AK", "HI", "DC")
  ) %>%
  mutate(state_name = tolower(state_name))


# State population bar chart (current census year), largest states first.
census_data %>% ggplot(aes(reorder(state_abb, desc(population)), population)) +
  geom_col(color = "red", fill = "white") +
  labs(x = "State Name", y = "Population", title = "State Population") +
  theme_minimal() +
  scale_y_continuous(labels = comma) +
  # The original stacked two axis.text.x themes (angle 80, then angle 90);
  # ggplot merges theme elements property-wise with the later one winning,
  # so this single theme states the effective result directly.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
        text = element_text(size = 20))

# fig.height=5, fig.width=10
# Interactive scatter of state population against median household income.
pop_income_plot <- ggplot(census_data) +
  aes(x = population, y = median_household_income) +
  geom_point(shape = "bullet", size = 3.9, colour = "#B22222") +
  labs(x = "State Population", y = "Median Income", title = "Population vs. House Income") +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(text = element_text(size = 20))
ggplotly(pop_income_plot)

About the Tax Data

# Federal Income Tax
str(df_federal_income_tax)
## 'data.frame':    7 obs. of  5 variables:
##  $ rates               : num  0.1 0.12 0.22 0.24 0.32 0.35 0.37
##  $ min_single_bracket  : int  0 9951 40526 86376 164926 209426 523601
##  $ max_single_brackets : int  9950 40525 86375 164925 209425 523600 100000
##  $ min_married_brackets: int  0 19902 81052 172752 329852 418852 628301
##  $ max_married_brackets: int  19900 81050 172750 329850 418850 628300 100000
# State Income Tax
str(df_state_tax_long)
## 'data.frame':    203 obs. of  5 variables:
##  $ state_abb       : chr  "AL" "AL" "AL" "AK" ...
##  $ single_rates    : num  0.02 0.04 0.05 0 0.03 0.03 0.04 0.05 0.08 0.02 ...
##  $ single_brackets : num  0 500 3000 0 0 ...
##  $ married_rates   : num  0.02 0.04 0.05 0 0.03 0.03 0.04 0.05 0.08 0.02 ...
##  $ married_brackets: num  0 1000 6000 0 0 ...

About the Zillow Data Set

### Create filtered data sets for the maps: 2020 average 1-bedroom expense
### by zip (point maps) and by state.
### `dates` is a factor; the original compared it with between() against
### as.factor('2020-01-31') / as.factor('2020-12-31'), which compares the
### underlying integer codes (both single-level factors code to 1), not
### calendar dates. Converting to Date makes the 2020 range explicit.
filtered <- allhomesLocation %>%
  filter(
    bedrooms == 1,
    between(as.Date(as.character(dates)), as.Date("2020-01-31"), as.Date("2020-12-31")),
    !is.na(house_expense),
    house_expense < 1000000
  ) %>%
  group_by(Zip, state_name, latitude, longitude) %>%
  summarise(house_expense = mean(house_expense))

filteredState <- allhomesLocation %>%
  filter(
    bedrooms == 1,
    between(as.Date(as.character(dates)), as.Date("2020-01-31"), as.Date("2020-12-31")),
    !is.na(house_expense)
  ) %>%
  group_by(state_name) %>%
  summarise(house_expense = mean(house_expense))

# Title-case the map regions so they match state_name in `filtered`.
us <- map_data("state") %>% mutate(region = str_to_title(region))
# US map. The points map house_expense to the color aesthetic, so the
# gradient must be a color scale -- the original scale_fill_gradient was
# inert for the points.
ggplot(filtered, aes(map_id = state_name)) +
  geom_map(map = us, fill = "gray") +
  expand_limits(x = us$long, y = us$lat) +
  geom_point(aes(x = longitude, y = latitude, color = house_expense)) +
  coord_map() +
  scale_color_gradient(low = "beige", high = "blue") +
  theme_map() +
  ggtitle("2020 Average House Expense by Zip") +
  labs(color = "House Expense") +
  theme(plot.title = element_text(size = 16))

#NE Map
ggplot(filtered, aes(map_id = state_name)) +
  geom_map(map = us, fill = "gray") + 
  expand_limits(x = us$long, y = us$lat) +
  geom_point(aes(x = longitude, y = latitude, color = house_expense)) +
  coord_map() + 
  scale_fill_gradient(low="beige", high="blue") +
  theme_map() +
  xlim(-80,-67) + 
  ylim(37.5,47.5) +
  ggtitle("2020 Average House Expense by Zip - NE") +
  labs(color = "House Expense") +
  theme(plot.title = element_text(size=16))

#California Map
ggplot(filtered, aes(map_id = state_name)) +
  geom_map(map = us, fill = "gray") + 
  expand_limits(x = us$long, y = us$lat) +
  geom_point(aes(x = longitude, y = latitude, color = house_expense)) +
  coord_map() + 
  xlim(-125,-115) + 
  ylim(33, 42.5) +
  scale_fill_gradient(low="beige", high="blue") +
  ggtitle("2020 Average House Expense by Zip - California") +
  labs(color = "House Expense") +
  theme(plot.title = element_text(size=16)) +
  theme_map()

Word Cloud Data Science Salary

# Pull the raw job-description text and sanity-check the first entry.
job_description <- df_data_science_jobs$job_description
head(job_description,1)
## [1] "About DirectPath  DirectPath guides employees to make better health care decisions with individualized education for selecting the right benefit plan, expert assistance in making informed care choices and rewards for sensible financial decisions. Position Summary The Data Analyst functions as a key player to ensure data accuracy for client facing reports & internal applications. The role will coordinate with Client Relationship Managers and Solutions Architect to develop and fix carrier, payroll, and other ad hoc reports based on client/carrier requirements using bSwift and SQL Server Management Studio (SSMS). The Data Analyst can be located in Birmingham, AL or Milwaukee, WI. Relocation assistance, for this position, is not available.     As an industry leader in the employee engagement space, DirectPath offers our full-time regular employees a comprehensive and competitive compensation and employee benefits package. With our contribution toward the monthly premium, employees and their families have the choice of a generous group PPO medical plan or a HDHP with a health savings account option. We also provide company paid long term disability, short term disability, basic life insurance and a critical illness insurance policy. Additional benefits an employee may choose are dental, vision, additional life insurance, accident insurance, additional critical illness insurance, pet insurance and flexible spending accounts. Retirement planning is important too. Employees are eligible to participate in the 401(k) on the first day of employment and eligible for the company match after 6 months. Employees also enjoy a flexible time off plan that allows a new hire up to 15 days plus 9 holidays in the first year! (prorated based on date of hire, calendar year)"

Tax Data Set

# Choropleth of each state's marginal single-filer rate at a $100k salary:
# keep only the bracket row whose range contains 100000, then fill by rate.
df_state_tax_wide %>%
  filter(min_single_bracket <= 100000, max_single_bracket >= 100000) %>%
  ggplot(aes(map_id = state_name)) +
  geom_map(map = us, aes(fill = single_rates)) +
  expand_limits(x = us$long, y = us$lat) +
  scale_fill_gradient(labels = percent) +
  labs(title = "Single Tax Rate", fill = "Tax Rate", x = "", y = "") +
  coord_map() +
  theme_map() +
  theme(plot.title = element_text(size = 22))

# Married-filer map: fill must use married_rates — the original reused
# single_rates, so the "Married Tax Rate" map just repeated the single map.
df_state_tax_wide %>% filter(max_married_bracket >= 100000 & min_married_bracket <= 100000) %>% 
                              ggplot(aes(map_id = state_name)) +
                              geom_map(map = us, aes(fill = married_rates)) +
                              expand_limits(x = us$long, y = us$lat) +
                              scale_fill_gradient(labels = percent) +
                              labs(fill = "Tax Rate") +
                              ylab("") +
                              xlab("") +
                              coord_map() +
                              ggtitle("Married Tax Rate") +
                              theme_map() +
                              theme(plot.title = element_text(size=22)) 

Exploratory Data Analysis

Min & Max Salary

#looking back at data scientists salaries
# Frequency polygon of maximum salary, one line per state region ($500 bins).
ggplot(df_data_science_jobs, aes(x = max_salary)) +
  geom_freqpoly(aes(colour = state_region), binwidth = 500) +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Maximum Salary Analysis",
    x = "Max Salary",
    y = "Max Salary Count",
    color = "State Region"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 20))

# Companion plot: minimum salary distribution by state region ($500 bins).
ggplot(df_data_science_jobs, aes(x = min_salary)) +
  geom_freqpoly(aes(colour = state_region), binwidth = 500) +
  scale_x_continuous(labels = comma) +
  labs(
    title = "Minimum Salary Analysis",
    x = "Min Salary",
    y = "Min Salary Count",
    color = "State Region"
  ) +
  theme_minimal() +
  theme(text = element_text(size = 20))

California vs. Texas

#lets compare to a texas! still high population but more spread out rural land 
# Keep only TX and CA rows, with just the salary columns.
state_data <- df_data_science_jobs %>%
  filter(state_abb %in% c("TX", "CA")) %>%
  select(state_abb, min_salary, max_salary, avg_salary)


# Scaled density curves so the two states' salary shapes are comparable.
ggplot(state_data, aes(x = avg_salary, y = ..scaled.., colour = state_abb)) +
  geom_density(adjust = 1L, fill = NA) +
  scale_color_hue(direction = 1) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  labs(title = "Texas vs. California Salary Comparison",
       x = "Avg Salary",
       y = "Scaled",
       color = "State Abbreviation") +
  theme_minimal() +
  theme(text = element_text(size = 20))

Wordcloud - Job Description

# Lowercase all job-description text for text mining.
description <- tolower(df_data_science_jobs$job_description)

#sylFile <- "txt_files/ist_687_syllabus.txt"
# input <- readLines(description)



# Peek at the last description to confirm the lowercasing.
tail(description, 1)
## [1] "bachelor’s degree in a highly quantitative field (computer science, machine learning, operational research, statistics, mathematics, etc.) or equivalent professional or military experience experience with ml fields, e.g., natural language processing, computer vision, statistical learning theory 6+ years of industry experience in predictive modeling, data science, and analysis experience in an ml engineer or data scientist role building and deploying ml models or hands on experience developing deep learning models experience writing code in python, r, scala, java, c++ with documentation for reproducibility experience handling terabyte size datasets, diving into data to discover hidden patterns, using data visualization tools, writing sql, and working with gpus to develop models want to help the largest global enterprises derive business value through the adoption of artificial intelligence (ai) and machine learning (ml)? excited by using massive amounts of disparate data to develop ml models? eager to learn to apply ml to a diverse array of enterprise use cases? thrilled to be a part of amazon who has been pioneering and shaping the world’s ai/ml technology for decades?  at amazon web services (aws), we are helping large enterprises build ml models on the aws cloud. we are applying predictive technology to large volumes of data and against a wide spectrum of problems. aws professional services works together with aws customers to address their business needs using ai solutions.  aws professional services is a unique consulting team. we pride ourselves on being customer obsessed and highly focused on the ai enablement of our customers. if you have experience with ai, including building ml models, we’d like to have you join our team. you will get to work with an innovative company, with great teammates, and have a lot of fun helping our customers. 
a successful candidate will be a person who enjoys diving deep into data, doing analysis, discovering root causes, and designing long-term solutions.  this is a customer-facing role and you will be required to travel to client locations and deliver professional services as needed.   amazon is committed to a diverse and inclusive workplace. amazon is an equal opportunity employer and does not discriminate on the basis of race, national origin, gender, gender identity, sexual orientation, protected veteran status, disability, age, or other legally protected status. for individuals with disabilities who would like to request an accommodation, please visit https://www.amazon.jobs/en/disability/us.  pursuant to the san francisco fair chance ordinance, we will consider for employment qualified applicants with arrest and conviction records.  pursuant to the los angeles fair chance ordinance, we will consider for employment qualified applicants with arrest and conviction records.  for employees based in colorado, this position starts at $116,200 per year. a sign-on bonus and restricted stock units may be provided as part of the compensation package, in addition to a range of medical, financial, and/or other benefits, dependent on the position offered."
# One character element per job posting (8,269 total per the output below).
str(description)
##  chr [1:8269] "about directpath  directpath guides employees to make better health care decisions with individualized educatio"| __truncated__ ...
### Word cloud

#use TM package
#create the word list with counts
# Each job description becomes one document in the corpus.
words.vec <- VectorSource(description)
words.corpus <- Corpus(words.vec)
words.corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 8269
# Term-document matrix over the raw (not-yet-cleaned) corpus.
tdm <- TermDocumentMatrix(words.corpus)
str(tdm)
## List of 6
##  $ i       : int [1:927689] 1 2 3 4 5 6 7 8 9 10 ...
##  $ j       : int [1:927689] 1 1 1 1 1 1 1 1 1 1 ...
##  $ v       : num [1:927689] 1 1 1 1 1 1 1 1 3 1 ...
##  $ nrow    : int 43461
##  $ ncol    : int 8269
##  $ dimnames:List of 2
##   ..$ Terms: chr [1:43461] "(prorated" "(ssms)." "401(k)" "about" ...
##   ..$ Docs : chr [1:8269] "1" "2" "3" "4" ...
##  - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
##  - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
#explore what we created
# NOTE(review): duplicate of the str(tdm) call just above.
str(tdm)
## List of 6
##  $ i       : int [1:927689] 1 2 3 4 5 6 7 8 9 10 ...
##  $ j       : int [1:927689] 1 1 1 1 1 1 1 1 1 1 ...
##  $ v       : num [1:927689] 1 1 1 1 1 1 1 1 3 1 ...
##  $ nrow    : int 43461
##  $ ncol    : int 8269
##  $ dimnames:List of 2
##   ..$ Terms: chr [1:43461] "(prorated" "(ssms)." "401(k)" "about" ...
##   ..$ Docs : chr [1:8269] "1" "2" "3" "4" ...
##  - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
##  - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# Spot-check a 20x20 corner of the term-document matrix.
inspect(tdm[1:20,1:20])
## <<TermDocumentMatrix (terms: 20, documents: 20)>>
## Non-/sparse entries: 63/337
## Sparsity           : 84%
## Maximal term length: 13
## Weighting          : term frequency (tf)
## Sample             :
##             Docs
## Terms         1 11 13 15 18  4 5  6  7 9
##   (prorated   1  0  0  0  0  0 0  0  0 0
##   (ssms).     1  0  0  0  0  0 0  0  0 0
##   401(k)      1  0  0  0  0  0 0  0  0 0
##   about       1  0  0  0  0  0 1  4  4 0
##   accuracy    1  0  0  0  0  0 0  1  1 0
##   additional  3  0  0  0  0  0 0  0  0 0
##   also        2  1  0  0  1  0 0  3  2 0
##   analyst     2  2  0  0  1  0 0  0  0 0
##   and        11  9  9 16 15 14 9 29 29 8
##   are         2  0  2  0  0  0 0  2  2 0
# Collapse to a total count per term across all documents.
m <- as.matrix(tdm)
wordCounts <- rowSums(m)
head(wordCounts)
## (prorated   (ssms).    401(k)     about  accident   account 
##         1         1        59      2482        11       152
# Most frequent terms first ("and"/"the" dominate before stopword removal).
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts)
##   and   the  data  with   for   our 
## 85747 50189 27548 18253 18151 17135
# wordcloud(names(wordCounts), wordCounts)

# now remove stuff we don't care about
# Normalise the corpus: lowercase, strip punctuation and digits, then drop
# English stopwords plus domain words that would otherwise dominate the cloud.
words.corpus <- words.corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeWords,
         c("data","will","analytics","learning","analysis","position","role","job","new","work"))


# recreate the wordcount
# Rebuild the term-document matrix from the cleaned corpus.
tdm <- TermDocumentMatrix(words.corpus)
tdm
## <<TermDocumentMatrix (terms: 24842, documents: 8269)>>
## Non-/sparse entries: 736896/204681602
## Sparsity           : 100%
## Maximal term length: 146
## Weighting          : term frequency (tf)
# Summary of our clean data: top terms after stopword/domain-word removal.
m <- as.matrix(tdm)
wordCounts <- rowSums(m)
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts, 20)
##    business        team  experience     science   solutions     support 
##        8901        7943        7791        5019        4582        3930 
## development  technology     company    services     machine  management 
##        3613        3600        3490        3318        3236        3057 
##        help information   customers    software     develop      people 
##        3013        2992        2787        2766        2760        2743 
##      across     analyst 
##        2742        2720
# Create word cloud
# NOTE(review): cloudFrame is built but never used — wordcloud() is called
# with the named count vector directly.
cloudFrame <- data.frame(word=names(wordCounts), freq = wordCounts)

wordcloud(names(wordCounts), wordCounts, min.freq = 5, max.words = 50, rot.per = 0.35, 
          colors = brewer.pal(8, "Dark2"))

Wordcloud - Job Title

# Reuse the `description` variable, now holding job *titles* (not descriptions).
description <- df_data_science_jobs$job_title

#sylFile <- "txt_files/ist_687_syllabus.txt"
# input <- readLines(description)

tail(description, 20)
##  [1] "Senior Associate, ServiceNow Consulting"                                      
##  [2] "Senior Associate, Information Security Engineer - Managed Security Services"  
##  [3] "Senior Business Analyst/Product Manager, Data & Analytics – Clinical/Research"
##  [4] "Sr. Consumer Data Analyst (Remote or in-office)"                              
##  [5] "Senior Data Engineer"                                                         
##  [6] "SENIOR BUSINESS ANALYST/PRODUCT MANAGER, DATA AND ANALYTICS PLATFORM"         
##  [7] "SENIOR BUSINESS ANALYST/PRODUCT MANAGER, DATA & ANALYTICS – CLINICAL/RESEARCH"
##  [8] "Data Architect Administrator (Remote)"                                        
##  [9] "Sr. Cloud Software Engineer - AI Engineering - Remote"                        
## [10] "Senior Financial Data Analyst"                                                
## [11] "Senior Associate, Cloud DevOps Engineer"                                      
## [12] "Entegral Data Enterprise Engineer/Architect (Remote)"                         
## [13] "Senior Data Analyst"                                                          
## [14] "Senior Machine Learning Engineer"                                             
## [15] "Manager, Full Stack Developer"                                                
## [16] "Senior Machine Learning Engineer"                                             
## [17] "Senior Statistician"                                                          
## [18] "Senior Machine Learning Engineer"                                             
## [19] "Statistician Technician"                                                      
## [20] "Senior Data Scientist - ProServe"
# 8,269 titles, same length as the description vector above.
str(description)
##  chr [1:8269] "Data Analyst" "DATA SCIENCE CONSULTANT BIRMINGHAM" ...
### Word cloud

#use TM package
#create the word list with counts
# Each job title becomes one (short) document in the corpus.
words.vec <- VectorSource(description)
words.corpus <- Corpus(words.vec)
words.corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 8269
# Term-document matrix over the raw title corpus.
tdm <- TermDocumentMatrix(words.corpus)
str(tdm)
## List of 6
##  $ i       : int [1:30953] 1 2 3 4 2 5 1 2 2 6 ...
##  $ j       : int [1:30953] 1 1 2 2 2 2 3 3 4 4 ...
##  $ v       : num [1:30953] 1 1 1 1 1 1 1 1 1 1 ...
##  $ nrow    : int 2663
##  $ ncol    : int 8269
##  $ dimnames:List of 2
##   ..$ Terms: chr [1:2663] "analyst" "data" "birmingham" "consultant" ...
##   ..$ Docs : chr [1:8269] "1" "2" "3" "4" ...
##  - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
##  - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
#explore what we created
# NOTE(review): duplicate of the str(tdm) call just above.
str(tdm)
## List of 6
##  $ i       : int [1:30953] 1 2 3 4 2 5 1 2 2 6 ...
##  $ j       : int [1:30953] 1 1 2 2 2 2 3 3 4 4 ...
##  $ v       : num [1:30953] 1 1 1 1 1 1 1 1 1 1 ...
##  $ nrow    : int 2663
##  $ ncol    : int 8269
##  $ dimnames:List of 2
##   ..$ Terms: chr [1:2663] "analyst" "data" "birmingham" "consultant" ...
##   ..$ Docs : chr [1:8269] "1" "2" "3" "4" ...
##  - attr(*, "class")= chr [1:2] "TermDocumentMatrix" "simple_triplet_matrix"
##  - attr(*, "weighting")= chr [1:2] "term frequency" "tf"
# Spot-check a 20x20 corner of the title term-document matrix.
inspect(tdm[1:20,1:20])
## <<TermDocumentMatrix (terms: 20, documents: 20)>>
## Non-/sparse entries: 51/349
## Sparsity           : 87%
## Maximal term length: 17
## Weighting          : term frequency (tf)
## Sample             :
##             Docs
## Terms        1 10 14 16 17 2 3 4 5 9
##   analyst    1  1  1  0  0 0 1 0 0 0
##   assurance  0  0  0  0  0 0 0 0 1 0
##   birmingham 0  0  0  0  0 1 0 0 0 0
##   consultant 0  0  0  0  0 1 0 0 0 0
##   data       1  1  1  1  1 1 1 1 0 0
##   engineer   0  0  0  1  1 0 0 0 0 1
##   junior     0  0  0  1  0 0 0 0 0 1
##   science    0  0  0  0  1 1 0 0 0 0
##   scientist  0  0  0  0  0 0 0 1 0 0
##   software   0  0  0  0  0 0 0 0 1 1
# Collapse to total count per term across all titles.
m <- as.matrix(tdm)
wordCounts <- rowSums(m)
head(wordCounts)
##    analyst       data birmingham consultant    science  scientist 
##       2049       6199          3        162        548       2229
# Most frequent title terms first ("data" dominates before cleaning).
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts)
##      data scientist   analyst  engineer    senior analytics 
##      6199      2229      2049      1706      1058       611
# wordcloud(names(wordCounts), wordCounts)

# now remove stuff we don't care about
# Normalise the title corpus: lowercase, strip punctuation and digits, then
# drop English stopwords plus the ubiquitous job-family words so the cloud
# highlights the distinguishing terms (seniority, specialty, etc.).
words.corpus <- words.corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removeWords,
         c("data","engineer","scientist","science","analyst","analytics","business","engineering","software"))


# recreate the wordcount
# Rebuild the term-document matrix from the cleaned title corpus.
tdm <- TermDocumentMatrix(words.corpus)
tdm
## <<TermDocumentMatrix (terms: 1976, documents: 8269)>>
## Non-/sparse entries: 15187/16324357
## Sparsity           : 100%
## Maximal term length: 33
## Weighting          : term frequency (tf)
# Summary of our clean data: top title terms after removal of stopwords and
# job-family words.
m <- as.matrix(tdm)
wordCounts <- rowSums(m)
wordCounts <- sort(wordCounts, decreasing = TRUE)
head(wordCounts, 20)
##       senior     learning      manager      machine       remote intelligence 
##         1062          465          441          427          288          213 
##         lead    associate    developer     research     director   consultant 
##          211          210          204          184          181          175 
##    marketing    architect   specialist   operations      product            – 
##          173          151          138          131          120          113 
##    principal      systems 
##          108          105
# Create word cloud
# NOTE(review): cloudFrame is again built but never used by wordcloud().
cloudFrame <- data.frame(word=names(wordCounts), freq = wordCounts)

wordcloud(names(wordCounts), wordCounts, min.freq = 3, max.words = 50, rot.per = 0.35, 
          colors = brewer.pal(8, "Dark2"))

Multi-Linear Regression Model

Improving the Regression Model

# Creating the average rent cost
# 2020 mean rent per state (DC excluded).  summarise(.groups = "drop")
# already returns an ungrouped tibble, so the trailing ungroup() the
# original carried was redundant and has been removed.
df_zillow_avg_state <- df_zillow_state %>%
  filter(between(as.Date(date), as.Date("2020-01-31"), as.Date("2020-12-31")),
         state_abb != "DC") %>%
  group_by(state_abb, state_name, state_division, state_region) %>%
  summarise(rent_cost = mean(rent_cost), .groups = "drop")

### Lets bring the census data into our df_data_science_job data set

# Join keys made explicit (these are exactly the keys dplyr auto-detected in
# the original "Joining, by = ..." messages), so the joins are reproducible
# and silent.  census_current_flag keeps only the current census rows.
model_data_set <- df_data_science_jobs %>% 
                        left_join(df_census_data,
                                  by = c("state_abb", "state_name", "state_division", "state_region")) %>% 
                        filter(census_current_flag) %>% 
                        left_join(df_zillow_avg_state,
                                  by = c("state_abb", "state_name", "state_division", "state_region")) %>% 
                        select(-job_description, -year_census, -date_census_start, -date_census_end,
                               -census_current_flag, -min_salary, -max_salary) %>% 
                        rename(mhhi = median_household_income)
# Baseline model: salary ~ state population + median household income.
model_data_science_1 <- lm(avg_salary ~ population + mhhi, model_data_set)
model_summary <-  summary(model_data_science_1)
model_summary
## 
## Call:
## lm(formula = avg_salary ~ population + mhhi, data = model_data_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -79968 -14591   -128  13300 212080 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.648e+04  1.496e+03   31.06   <2e-16 ***
## population  5.722e-04  2.751e-05   20.80   <2e-16 ***
## mhhi        6.346e-01  2.216e-02   28.64   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22240 on 8266 degrees of freedom
## Multiple R-squared:  0.1356, Adjusted R-squared:  0.1354 
## F-statistic: 648.3 on 2 and 8266 DF,  p-value: < 2.2e-16
# Add avg_size as a predictor (presumably average company size — confirm
# upstream); R^2 improves from ~0.136 to ~0.176.
model_data_science_2 <- lm(avg_salary ~ population + mhhi + avg_size, model_data_set)
model_summary <-  summary(model_data_science_2)
model_summary
## 
## Call:
## lm(formula = avg_salary ~ population + mhhi + avg_size, data = model_data_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -77385 -14787    108  12923 209688 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.856e+04  1.513e+03   25.48   <2e-16 ***
## population  5.992e-04  2.690e-05   22.28   <2e-16 ***
## mhhi        6.691e-01  2.170e-02   30.83   <2e-16 ***
## avg_size    1.078e+00  5.360e-02   20.11   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21720 on 8265 degrees of freedom
## Multiple R-squared:  0.1759, Adjusted R-squared:  0.1756 
## F-statistic:   588 on 3 and 8265 DF,  p-value: < 2.2e-16
# Add job level (mega_job_title: Entry/Senior/Leader, Entry as baseline);
# R^2 improves to ~0.269.
model_data_science_3 <- lm(avg_salary ~ population + mhhi + avg_size + mega_job_title, model_data_set)
model_summary <-  summary(model_data_science_3)
model_summary
## 
## Call:
## lm(formula = avg_salary ~ population + mhhi + avg_size + mega_job_title, 
##     data = model_data_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75202 -14006    681  12980 193745 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.072e+04  1.448e+03   21.22   <2e-16 ***
## population           7.027e-04  2.556e-05   27.49   <2e-16 ***
## mhhi                 7.187e-01  2.051e-02   35.05   <2e-16 ***
## avg_size             7.806e-01  5.136e-02   15.20   <2e-16 ***
## mega_job_titleLeader 2.236e+04  8.866e+02   25.23   <2e-16 ***
## mega_job_titleSenior 1.329e+04  5.384e+02   24.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20460 on 8263 degrees of freedom
## Multiple R-squared:  0.2685, Adjusted R-squared:  0.2681 
## F-statistic: 606.7 on 5 and 8263 DF,  p-value: < 2.2e-16
# Add the state average rent cost; R^2 improves modestly to ~0.275.
model_data_science_4 <- lm(avg_salary ~ population + mhhi + avg_size + mega_job_title + rent_cost, model_data_set)
model_summary <-  summary(model_data_science_4)
model_summary
## 
## Call:
## lm(formula = avg_salary ~ population + mhhi + avg_size + mega_job_title + 
##     rent_cost, data = model_data_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -78107 -13716    832  12857 192633 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.641e+04  1.584e+03  22.989   <2e-16 ***
## population           5.582e-04  3.043e-05  18.342   <2e-16 ***
## mhhi                 5.655e-01  2.701e-02  20.938   <2e-16 ***
## avg_size             7.969e-01  5.116e-02  15.576   <2e-16 ***
## mega_job_titleLeader 2.241e+04  8.826e+02  25.394   <2e-16 ***
## mega_job_titleSenior 1.324e+04  5.360e+02  24.698   <2e-16 ***
## rent_cost            1.369e+00  1.580e-01   8.661   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20370 on 8262 degrees of freedom
## Multiple R-squared:  0.2751, Adjusted R-squared:  0.2746 
## F-statistic: 522.6 on 6 and 8262 DF,  p-value: < 2.2e-16

Final Multi Linear Regression Model

# Final model: adds state_region on top of model 4's predictors (R^2 ~0.282).
model_data_science_5 <- lm(avg_salary ~ population + mhhi + mega_job_title + state_region + rent_cost + avg_size, model_data_set)
model_summary <-  summary(model_data_science_5)
model_summary
## 
## Call:
## lm(formula = avg_salary ~ population + mhhi + mega_job_title + 
##     state_region + rent_cost + avg_size, data = model_data_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -74160 -13668    627  12698 198411 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.587e+04  1.657e+03  21.654  < 2e-16 ***
## population             5.364e-04  3.074e-05  17.448  < 2e-16 ***
## mhhi                   5.295e-01  2.910e-02  18.195  < 2e-16 ***
## mega_job_titleLeader   2.271e+04  8.797e+02  25.820  < 2e-16 ***
## mega_job_titleSenior   1.346e+04  5.346e+02  25.176  < 2e-16 ***
## state_regionNortheast -2.577e+03  7.553e+02  -3.412 0.000647 ***
## state_regionSouth      1.187e+03  5.590e+02   2.123 0.033800 *  
## state_regionWest      -6.822e+03  1.061e+03  -6.428 1.37e-10 ***
## rent_cost              2.253e+00  1.960e-01  11.496  < 2e-16 ***
## avg_size               8.143e-01  5.115e-02  15.918  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20280 on 8259 degrees of freedom
## Multiple R-squared:  0.282,  Adjusted R-squared:  0.2812 
## F-statistic: 360.5 on 9 and 8259 DF,  p-value: < 2.2e-16
# Keep the final model's coefficient table for later reporting.
model_coefficients <-  model_summary$coefficients

Validating the Model

# Root mean square of a numeric error vector: sqrt of the mean squared error.
# Returns NA if `error` contains NA values (mean() propagates them).
RootMeanSq <- function(error) {
  sqrt(mean(error * error))
}



# In-sample predictions for the final model.
model_predict <- predict(model_data_science_5, model_data_set)
# NOTE(review): abs() is redundant here — RootMeanSq squares the errors anyway.
model_error <- abs(model_data_set$avg_salary - model_predict)

# Mean salary for context when judging the RMSE below.
mean(model_data_set$avg_salary)
## [1] 94260.91
# RMSE of the full model (~$20k against a ~$94k mean salary).
RootMeanSq(model_error)
## [1] 20264.09
### Lets look at just Senior and Entry.  I think the model is doing really good here.
# Drop the Leader rows and re-check fit on the remaining two job levels.
model_entry_senior <- model_data_set %>% filter(mega_job_title %in% c("Senior","Entry"))

mean(model_entry_senior$avg_salary)
## [1] 93002.67
# RMSE on the Entry/Senior subset is slightly lower than on the full data.
model_predict <- predict(model_data_science_5, model_entry_senior)
model_error <- abs(model_entry_senior$avg_salary - model_predict)
RootMeanSq(model_error)
## [1] 19758.7

Model Correlation

# Setting the data set used for the model to a new dataframe
df_corr <- model_data_set

# Finding the unique values for job level and region for the correlation matrix
unique(df_corr$mega_job_title)
## [1] "Entry"  "Senior" "Leader"
# Four regions to encode numerically below.
unique(df_corr$state_region)
## [1] "South"         "West"          "Northeast"     "North Central"
# Setting 0, 1, 2 for each of the job levels
# Use bare column names inside mutate(): df_corr$... works here but bypasses
# tidy evaluation and silently refers to the pre-mutate frame.
df_corr <- df_corr %>% mutate(job_title_num = case_when(
  mega_job_title == "Entry" ~ 0,
  mega_job_title == "Senior" ~ 1,
  mega_job_title == "Leader" ~ 2
))

# Setting 0, 1, 2, 3 for each of the regions
df_corr <- df_corr %>% mutate(state_region_num = case_when(
  state_region == "South" ~ 0,
  state_region == "West" ~ 1,
  state_region == "Northeast" ~ 2,
  state_region == "North Central" ~ 3
))

# Select the matrix columns by name instead of position — the original
# df_corr[13:19] silently breaks if the upstream column order changes.
# (Order matches the display names assigned below.)
df_corr <- df_corr %>%
  select(avg_salary, avg_size, population, mhhi, rent_cost,
         job_title_num, state_region_num)

# Changing the column names so they look nicer on the matrix
column_names <- c("Avg Salary", "Avg Size","Population", "Median Income","Rent Cost","Job Level", "Region")
colnames(df_corr) <- column_names

# Setting the dataframe to correlation
corr <- cor(df_corr)

# Plotting the correlation
# NOTE(review): scale_fill_gradient2 replaces ggcorrplot's built-in fill
# scale (expect a "replacing scale" message) and theme_minimal() overrides
# parts of ggcorrplot's theme — both appear intentional here.
ggcorrplot(corr, method = "circle", hc.order = TRUE, type = "upper") +
                scale_fill_gradient2(low = "purple", high = "salmon", mid = "white", midpoint = 0, 
                                   limit = c(-1,1), space = "Lab", name="Correlation") + 
                labs(title = "Correlation Matrix") + theme_minimal() + 
                theme(axis.title.x = element_blank(), axis.title.y = element_blank()) +
                  theme(text = element_text(size= 20))

Summarizing The Results

Cost of Living Analysis

### We are going to remove outliers based on the histogram.  Which is jobs over 200k
# Per-state net-pay comparison:
#   1. average DS salary and job count per state (DC/AK and >$200k jobs dropped),
#   2. attach the state single-filer rate whose bracket contains that salary,
#   3. cross with the federal table and keep the matching federal bracket,
#   4. subtract rent and compare the salary to median household income.
# Changes from the original: length(job_title) -> n(), and the redundant
# ungroup() after .groups = "drop" removed.
# NOTE(review): applying one marginal rate to the whole salary is a flat-rate
# approximation, not a progressive-tax calculation.  Also the federal table
# caps max_single_brackets at 100000 (see its str() above), so a state
# averaging above $100k would fall out of the federal-bracket filter.
cost_living <- df_data_science_jobs %>% filter(state_abb %notin% c('DC','AK') &
                                               avg_salary < 200000) %>% 
                      group_by(state_abb, state_name, state_division, state_region) %>%  
                      summarise(avg_salary = mean(avg_salary),
                                num_jobs = n(),
                                .groups = "drop") %>% 
                      left_join(df_state_tax_wide) %>% 
                      filter(max_single_bracket >= avg_salary & min_single_bracket <= avg_salary) %>% 
                      select(state_abb, state_name, state_region, state_division, avg_salary, single_rates, num_jobs) %>% 
                      crossing(df_federal_income_tax) %>% 
                      filter(max_single_brackets >= avg_salary & min_single_bracket <= avg_salary) %>% 
                      mutate(
                            gross_salary = avg_salary,
                            state_tax_rate = single_rates,
                            federal_tax_rate = rates
                      ) %>% 
                      left_join(df_zillow_avg_state) %>% 
                      left_join(df_census_data) %>% filter(census_current_flag==TRUE) %>% 
                      mutate(
                            net_salary = gross_salary * (1 - (state_tax_rate + federal_tax_rate)) - rent_cost,
                            chge_salary = net_salary /gross_salary -1,
                            above_median = gross_salary / median_household_income - 1
                            ) %>% 
                      select(state_abb,state_name, median_household_income,above_median,
                             chge_salary, gross_salary, net_salary, state_tax_rate, federal_tax_rate, rent_cost, num_jobs)


# Legend colours for the two overlaid bar layers.
fill <- c("Gross Salary" = "light blue", "Net Salary" = "salmon")
# Net-salary bars drawn on top of gross-salary bars, states ordered by
# descending gross pay (shared x mapping at the plot level).
cost_living %>%
  ggplot(aes(x = reorder(state_abb, desc(gross_salary)))) +
  geom_col(aes(y = gross_salary, fill = "Gross Salary"), color = "black") +
  geom_col(aes(y = net_salary, fill = "Net Salary"), color = "black") +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = fill) +
  labs(title = "Gross vs. Net Salary by State",
       x = "State",
       y = "Salary",
       fill = "Legend") +
  theme(axis.text.x = element_text(angle = 45),
        legend.position = "bottom",
        text = element_text(size = 20))

# How far the average DS salary sits above each state's median household
# income.  The y axis shows a percentage (above_median with percent labels),
# so the original "Salary" axis label was wrong.
cost_living %>% ggplot(aes(reorder(state_abb,desc(above_median)), above_median)) +
      geom_col(fill = "#DA66AC", color = "black") +
      scale_y_continuous(labels = scales::percent) +
      labs(title = "Data Science Salary Over Median Household Income",
           x = "State",
           y = "% Above Median Income") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45),
            text = element_text(size= 20))

# Job counts per state.  The y variable is num_jobs, so the original
# "Salary" axis label was a copy-paste error.
cost_living %>% ggplot(aes(reorder(state_abb,desc(num_jobs)), num_jobs)) +
      geom_col(fill = "#EE828A", color = "black") +
      labs(title = "# of Data Science Jobs",
           x = "State",
           y = "Number of Jobs") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45),
            text = element_text(size= 20))